In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.patches as patches
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
In [14]:
# To show all columns when printing
pd.set_option("display.max_columns", None)

df = pd.read_csv(r"/Users/maniisshhhh/Downloads/Healthcare.csv")
df
Out[14]:
Age Gender BMI Glucose BloodPressure Insulin SkinThickness Pregnancies DiabetesPedigreeFunction PhysicalActivityLevel Outcome Country Cholesterol Smoking_Status Family_History Diet_Type Sleep_Hours Daily_Steps Water_Intake_Liters Mental_Health_Status Sleep_Quality Chronic_Disease Work_Stress_Level Alcohol_Consumption Diabetes_Status Diabetes_Type Fast_Food_Intake Prediabetes Fasting_Blood_Sugar Screen_Time
0 56 Female 27.0 76.5 68.90 16.40 8.9 2 1.181 Moderate 0 Australia 189.8 Yes No High-Fat 8.5 4878.0 3.7 Good Fair Thyroid High Regular No Diabetes Type 2 Rarely No 185.4 8.9
1 69 Male 28.9 43.5 66.20 31.90 20.5 2 0.339 Moderate 0 UK 235.4 Yes Yes High-Carb 4.9 5604.0 3.1 Good Fair Heart Disease Moderate Occasional No Diabetes Type 2 Rarely Yes 231.7 1.9
2 46 Female 27.4 82.1 75.25 79.10 16.1 2 1.659 High 1 South Africa 143.7 No No Balanced 7.0 7533.0 2.6 Good Good Thyroid Moderate Regular Diabetes Type 2 Sometimes No 64.8 10.7
3 32 Female 28.9 105.5 75.10 189.80 26.2 7 2.497 Moderate 0 Canada 219.5 Yes No Balanced 6.2 5963.0 2.6 Poor Fair Hypertension Low Occasional No Diabetes Type 2 Frequently No 89.1 6.3
4 60 Male 24.0 84.2 79.00 100.40 24.2 8 1.342 Low 1 UK 219.0 No Yes High-Fat 8.0 3201.0 2.6 Average Good Thyroid Moderate Regular Diabetes Type 2 Frequently No 137.7 4.7
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
17995 46 Female 32.3 84.3 86.10 59.50 15.3 6 1.019 High 0 Germany 179.8 No Yes Balanced 6.0 5513.0 1.2 Good Poor Hypertension Low Occasional No Diabetes Type 2 Sometimes No 111.2 6.6
17996 31 Male 28.9 137.2 79.80 87.30 20.0 5 0.329 Low 1 South Africa 101.5 No No High-Carb 6.3 5859.0 3.4 Poor Good Thyroid Low Regular Diabetes Type 2 Rarely Yes 77.4 6.1
17997 21 Male 26.9 108.4 69.90 79.85 32.8 9 0.989 High 1 Germany 229.8 No No Balanced 5.0 5386.0 1.7 Average Fair Thyroid High Occasional Diabetes Type 2 Sometimes Yes 52.0 10.9
17998 61 Male 28.6 139.4 66.30 92.00 20.0 8 1.456 Moderate 0 Canada 214.1 Yes Yes High-Carb 8.2 12200.0 4.1 Poor Fair Hypertension Moderate Occasional No Diabetes Type 2 Rarely No 171.3 9.5
17999 73 Male 28.0 135.4 77.80 115.60 21.3 6 1.398 Moderate 0 Brazil 221.1 No Yes High-Fat 7.3 1208.0 3.3 Average Poor Heart Disease Low Occasional No Diabetes Type 2 Rarely No 141.8 1.9

18000 rows × 30 columns

Basic Dataset Overview¶

In [15]:
print("Shape of dataset:", df.shape)
Shape of dataset: (18000, 30)
In [16]:
print("\nFirst 5 rows:")
print(df.head())
First 5 rows:
   Age  Gender   BMI  Glucose  BloodPressure  Insulin  SkinThickness  \
0   56  Female  27.0     76.5          68.90     16.4            8.9   
1   69    Male  28.9     43.5          66.20     31.9           20.5   
2   46  Female  27.4     82.1          75.25     79.1           16.1   
3   32  Female  28.9    105.5          75.10    189.8           26.2   
4   60    Male  24.0     84.2          79.00    100.4           24.2   

   Pregnancies  DiabetesPedigreeFunction PhysicalActivityLevel  Outcome  \
0            2                     1.181              Moderate        0   
1            2                     0.339              Moderate        0   
2            2                     1.659                  High        1   
3            7                     2.497              Moderate        0   
4            8                     1.342                   Low        1   

        Country  Cholesterol Smoking_Status Family_History  Diet_Type  \
0     Australia        189.8            Yes             No   High-Fat   
1            UK        235.4            Yes            Yes  High-Carb   
2  South Africa        143.7             No             No   Balanced   
3        Canada        219.5            Yes             No   Balanced   
4            UK        219.0             No            Yes   High-Fat   

   Sleep_Hours  Daily_Steps  Water_Intake_Liters Mental_Health_Status  \
0          8.5       4878.0                  3.7                 Good   
1          4.9       5604.0                  3.1                 Good   
2          7.0       7533.0                  2.6                 Good   
3          6.2       5963.0                  2.6                 Poor   
4          8.0       3201.0                  2.6              Average   

  Sleep_Quality Chronic_Disease Work_Stress_Level Alcohol_Consumption  \
0          Fair         Thyroid              High             Regular   
1          Fair   Heart Disease          Moderate          Occasional   
2          Good         Thyroid          Moderate             Regular   
3          Fair    Hypertension               Low          Occasional   
4          Good         Thyroid          Moderate             Regular   

  Diabetes_Status Diabetes_Type Fast_Food_Intake Prediabetes  \
0     No Diabetes        Type 2           Rarely          No   
1     No Diabetes        Type 2           Rarely         Yes   
2        Diabetes        Type 2        Sometimes          No   
3     No Diabetes        Type 2       Frequently          No   
4        Diabetes        Type 2       Frequently          No   

   Fasting_Blood_Sugar  Screen_Time  
0                185.4          8.9  
1                231.7          1.9  
2                 64.8         10.7  
3                 89.1          6.3  
4                137.7          4.7  
In [17]:
df.columns
Out[17]:
Index(['Age', 'Gender', 'BMI', 'Glucose', 'BloodPressure', 'Insulin',
       'SkinThickness', 'Pregnancies', 'DiabetesPedigreeFunction',
       'PhysicalActivityLevel', 'Outcome', 'Country', 'Cholesterol',
       'Smoking_Status', 'Family_History', 'Diet_Type', 'Sleep_Hours',
       'Daily_Steps', 'Water_Intake_Liters', 'Mental_Health_Status',
       'Sleep_Quality', 'Chronic_Disease', 'Work_Stress_Level',
       'Alcohol_Consumption', 'Diabetes_Status', 'Diabetes_Type',
       'Fast_Food_Intake', 'Prediabetes', 'Fasting_Blood_Sugar',
       'Screen_Time'],
      dtype='object')
In [18]:
df.dtypes
Out[18]:
Age                           int64
Gender                       object
BMI                         float64
Glucose                     float64
BloodPressure               float64
Insulin                     float64
SkinThickness               float64
Pregnancies                   int64
DiabetesPedigreeFunction    float64
PhysicalActivityLevel        object
Outcome                       int64
Country                      object
Cholesterol                 float64
Smoking_Status               object
Family_History               object
Diet_Type                    object
Sleep_Hours                 float64
Daily_Steps                 float64
Water_Intake_Liters         float64
Mental_Health_Status         object
Sleep_Quality                object
Chronic_Disease              object
Work_Stress_Level            object
Alcohol_Consumption          object
Diabetes_Status              object
Diabetes_Type                object
Fast_Food_Intake             object
Prediabetes                  object
Fasting_Blood_Sugar         float64
Screen_Time                 float64
dtype: object

Check Missing Values¶

In [19]:
missing = df.isnull().sum()
missing_percent = (missing / len(df)) * 100

missing_summary = pd.DataFrame({
    "Missing Values": missing,
    "Missing %": missing_percent.round(2)
})

print("\nMissing Values Summary:")
print(
    missing_summary[missing_summary["Missing Values"] > 0]
    .sort_values("Missing %", ascending=False)
)
Missing Values Summary:
                     Missing Values  Missing %
BMI                               5       0.03
Glucose                           5       0.03
Sleep_Hours                       5       0.03
Daily_Steps                       5       0.03
Water_Intake_Liters               5       0.03
In [20]:
plt.figure(figsize=(12,5))
sns.heatmap(df.isnull(), cbar=False)
plt.title("Missing Data Heatmap")
plt.show()
No description has been provided for this image
In [21]:
print("\nSummary Statistics (Numeric Columns):")
df.describe()
Summary Statistics (Numeric Columns):
Out[21]:
Age BMI Glucose BloodPressure Insulin SkinThickness Pregnancies DiabetesPedigreeFunction Outcome Cholesterol Sleep_Hours Daily_Steps Water_Intake_Liters Fasting_Blood_Sugar Screen_Time
count 18000.000000 17995.000000 17995.000000 18000.000000 18000.000000 18000.000000 18000.000000 18000.000000 18000.000000 18000.000000 17995.000000 17995.000000 17995.000000 18000.000000 18000.000000
mean 48.591056 28.015793 109.903351 75.112711 80.071211 19.982994 4.448167 1.301701 0.286389 199.731939 7.016605 6020.197888 2.500072 110.207017 6.524706
std 17.629407 5.899675 34.562562 11.836675 44.434725 7.871159 2.828718 0.681769 0.452086 39.580036 1.493874 2475.720091 0.797083 35.141720 3.177019
min 18.000000 4.900000 -46.300000 25.100000 -120.800000 -9.600000 0.000000 0.100000 0.000000 33.100000 1.300000 -3980.000000 -0.600000 -18.500000 1.000000
25% 34.000000 24.100000 87.100000 67.200000 51.175000 14.800000 2.000000 0.721000 0.000000 173.400000 6.000000 4361.500000 2.000000 86.200000 3.800000
50% 48.000000 28.200000 108.700000 75.250000 79.850000 20.000000 4.000000 1.294000 0.000000 199.800000 7.000000 6060.000000 2.500000 110.200000 6.500000
75% 64.000000 32.000000 132.900000 83.000000 108.800000 25.100000 7.000000 1.884000 1.000000 226.400000 8.000000 7652.500000 3.000000 134.300000 9.300000
max 79.000000 54.900000 240.500000 119.900000 249.200000 50.600000 9.000000 2.500000 1.000000 358.300000 12.200000 15152.000000 6.100000 242.800000 12.000000

Data Type Cleaning Summary¶

In [22]:
num_cols = df.select_dtypes(include=["int64","float64"]).columns
cat_cols = df.select_dtypes(include="object").columns

# Fill numeric with median
for col in num_cols:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical with mode
for col in cat_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

print("Missing values after filling:\n", df.isnull().sum().sum())
Missing values after filling:
 0
In [23]:
categorical_cols = df.select_dtypes(include="object").columns
print("\nCategorical Columns:", list(categorical_cols))

# Explore categorical columns
for col in categorical_cols:
    print(f"\nColumn: {col}")
    print("Unique values:", df[col].nunique())
    print(df[col].value_counts().head())
Categorical Columns: ['Gender', 'PhysicalActivityLevel', 'Country', 'Smoking_Status', 'Family_History', 'Diet_Type', 'Mental_Health_Status', 'Sleep_Quality', 'Chronic_Disease', 'Work_Stress_Level', 'Alcohol_Consumption', 'Diabetes_Status', 'Diabetes_Type', 'Fast_Food_Intake', 'Prediabetes']

Column: Gender
Unique values: 3
Gender
Female    9374
Male      8621
Other        5
Name: count, dtype: int64

Column: PhysicalActivityLevel
Unique values: 3
PhysicalActivityLevel
High        6383
Low         5811
Moderate    5806
Name: count, dtype: int64

Column: Country
Unique values: 8
Country
South Africa    2630
Brazil          2259
UK              2253
India           2205
Australia       2182
Name: count, dtype: int64

Column: Smoking_Status
Unique values: 3
Smoking_Status
No         13546
Yes         4449
unknown        5
Name: count, dtype: int64

Column: Family_History
Unique values: 2
Family_History
No     10891
Yes     7109
Name: count, dtype: int64

Column: Diet_Type
Unique values: 3
Diet_Type
High-Carb    6296
Balanced     5865
High-Fat     5839
Name: count, dtype: int64

Column: Mental_Health_Status
Unique values: 3
Mental_Health_Status
Good       9190
Average    5268
Poor       3542
Name: count, dtype: int64

Column: Sleep_Quality
Unique values: 4
Sleep_Quality
Good         4883
Excellent    4426
Poor         4368
Fair         4323
Name: count, dtype: int64

Column: Chronic_Disease
Unique values: 3
Chronic_Disease
Thyroid          9256
Heart Disease    4477
Hypertension     4267
Name: count, dtype: int64

Column: Work_Stress_Level
Unique values: 3
Work_Stress_Level
Low         6319
High        5879
Moderate    5802
Name: count, dtype: int64

Column: Alcohol_Consumption
Unique values: 2
Alcohol_Consumption
Regular       12129
Occasional     5871
Name: count, dtype: int64

Column: Diabetes_Status
Unique values: 2
Diabetes_Status
No Diabetes    12845
Diabetes        5155
Name: count, dtype: int64

Column: Diabetes_Type
Unique values: 2
Diabetes_Type
Type 2    17215
Type 1      785
Name: count, dtype: int64

Column: Fast_Food_Intake
Unique values: 4
Fast_Food_Intake
Rarely        7192
Sometimes     6341
Frequently    3570
Daily          897
Name: count, dtype: int64

Column: Prediabetes
Unique values: 2
Prediabetes
No     12236
Yes     5764
Name: count, dtype: int64
In [24]:
# Convert numeric-looking object columns to numeric
numeric_like_cols = ["BMI","Glucose","Sleep_Hours","Daily_Steps","Water_Intake_Liters","Fast_Food_Intake"]

for col in numeric_like_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

Outlier Check (Summary)¶

In [25]:
plt.figure(figsize=(16, 20))

for i, col in enumerate(num_cols, 1):
    plt.subplot(len(num_cols)//3 + 1, 3, i)
    sns.boxplot(data=df, y=col, color="skyblue")
    plt.title(col)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [26]:
plt.figure(figsize=(18, 6))

cols = ["BMI","Glucose","Sleep_Hours","Daily_Steps","Water_Intake_Liters","Fast_Food_Intake"]
sns.boxplot(data=df[cols])
plt.title("Boxplot Overview of Key Numeric Variables")
plt.show()
No description has been provided for this image

EDA¶

In [27]:
plt.figure(figsize=(7, 5))
sns.boxplot(data=df, x="Diabetes_Status", y="BloodPressure", palette="coolwarm")
plt.title("Blood Pressure by Diabetes Status", fontsize=14)
plt.xlabel("Diabetes Status")
plt.ylabel("Blood Pressure")
plt.tight_layout()
plt.show()
No description has been provided for this image

Average Cholesterol by Gender (Bar Plot)¶

In [28]:
chol_gender = df.groupby("Gender", as_index=False)["Cholesterol"].mean()

plt.figure(figsize=(7, 5))
sns.barplot(data=chol_gender, x="Gender", y="Cholesterol", palette="viridis")
plt.title("Average Cholesterol by Gender", fontsize=14)
plt.ylabel("Cholesterol")

for i, v in enumerate(chol_gender["Cholesterol"]):
    plt.text(i, v + 1, f"{v:.1f}", ha="center")
    
plt.tight_layout()
plt.show()
No description has been provided for this image

Daily Steps vs BMI (Scatter Plot)¶

In [29]:
sample_df = df.sample(500, random_state=42)

plt.figure(figsize=(7, 5))
sns.scatterplot(data=sample_df,x="Daily_Steps",y="BMI",hue="Outcome",palette="coolwarm",alpha=0.6)
plt.title("Daily Steps vs BMI (Random Sample of 500 Rows)", fontsize=14)
plt.xlabel("Daily Steps")
plt.ylabel("BMI")
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Sample (2000 rows)¶

In [30]:
sample_df = df.sample(2000, random_state=42)

plt.figure(figsize=(7, 5))
sns.scatterplot(data=sample_df,x="Daily_Steps",y="BMI",hue="Outcome",palette="coolwarm",alpha=0.6)
plt.title("Daily Steps vs BMI (Random Sample of 2000 Rows)", fontsize=14)
plt.xlabel("Daily Steps")
plt.ylabel("BMI")
plt.tight_layout()
plt.show()
No description has been provided for this image

More Detailed¶

In [31]:
plt.figure(figsize=(7, 5))
sns.scatterplot(data=df, x="Daily_Steps", y="BMI", hue="Outcome", palette="coolwarm", alpha=0.6)
plt.title("Daily Steps vs BMI", fontsize=14)
plt.xlabel("Daily Steps")
plt.ylabel("BMI")
plt.tight_layout()
plt.show()
No description has been provided for this image

Sleep Hours Distribution (Histogram)¶

In [32]:
plt.figure(figsize=(7, 5))
sns.histplot(df["Sleep_Hours"], kde=True, bins=30, color="purple")
plt.title("Distribution of Sleep Hours", fontsize=14)
plt.xlabel("Sleep Hours")
plt.tight_layout()
plt.show()
No description has been provided for this image

Categorical Frequency Heatmap¶

In [33]:
cat_cols = df.select_dtypes(include="object").columns

freq_df = df[cat_cols].apply(lambda x: x.value_counts())
plt.figure(figsize=(12,7))
sns.heatmap(freq_df, cmap="viridis")
plt.title("Categorical Feature Frequency Heatmap")
plt.tight_layout()
plt.show()
No description has been provided for this image

BMI by Diet Type (Box Plot)¶

In [34]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Diet_Type", y="BMI", palette="magma")
plt.title("BMI by Diet Type")
plt.tight_layout()
plt.show()
No description has been provided for this image

UNIVARIATE ANALYSIS (Single variable)¶

Numeric Variables (Histograms)¶

In [35]:
df[num_cols].hist(figsize=(15,10), bins=30)
plt.suptitle("Distribution of Numerical Features")
plt.show()
No description has been provided for this image

Categorical Variables (Countplots)¶

In [36]:
import math

n = len(cat_cols)
rows = math.ceil(n / 3)

plt.figure(figsize=(20,20))
for i, col in enumerate(cat_cols, 1):
    plt.subplot(rows, 3, i)
    sns.countplot(data=df, x=col, palette="viridis")
    plt.title(col)

plt.tight_layout()
plt.show()
No description has been provided for this image

Boxplots for Outlier Detection¶

In [37]:
# Select all numeric columns
num_cols = df.select_dtypes(include=["float64", "int64"]).columns

plt.figure(figsize=(18, 6))
sns.boxplot(data=df[num_cols], palette="Set3")
plt.title("Boxplot Overview of All Numeric Features", fontsize=14)
plt.tight_layout()
plt.show()
No description has been provided for this image

BIVARIATE ANALYSIS (Two variables)¶

In [38]:
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x="Mental_Health_Status", y="Sleep_Hours", palette="viridis")

plt.title("Sleep Hours by Mental Health Status", fontsize=14)
plt.xlabel("Mental Health Status", fontsize=12)
plt.ylabel("Sleep Hours", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

Average BMI by Smoking Status¶

In [39]:
bmi_smoking = (df.groupby("Smoking_Status", as_index=False)["BMI"].mean().sort_values("BMI", ascending=False))

plt.figure(figsize=(10, 5))
sns.barplot(data=bmi_smoking, x="Smoking_Status", y="BMI", palette="viridis")

for index, row in bmi_smoking.iterrows():
    plt.text(index, row["BMI"] + 0.2, round(row["BMI"], 1),ha='center', fontsize=10)

plt.title("Average BMI by Smoking Status", fontsize=14)
plt.xlabel("Smoking Status")
plt.ylabel("Average BMI")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [40]:
sns.boxplot(data=df, x="Diabetes_Type", y="Age", palette="coolwarm")
plt.title("Age by Diabetes Type")
plt.show()
No description has been provided for this image

Random Sample of 500 Rows¶

In [41]:
sample_df = df.sample(500, random_state=42)

plt.figure(figsize=(6,4))
sns.scatterplot(data=sample_df,x="BMI",y="Glucose",hue="Gender",alpha=0.6)
plt.title("BMI vs Glucose (Random Sample of 500 Rows)")
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Sample of 2000 Rows¶

In [42]:
sample_df = df.sample(2000, random_state=42)

plt.figure(figsize=(6,4))
sns.scatterplot(data=sample_df,x="BMI",y="Glucose",hue="Gender",alpha=0.6)
plt.title("BMI vs Glucose (Random Sample of 2000 Rows)")
plt.tight_layout()
plt.show()
No description has been provided for this image

More Detailed¶

In [43]:
plt.figure(figsize=(6,4))
sns.scatterplot(data=df, x="BMI", y="Glucose", hue="Gender", alpha=0.6)
plt.title("BMI vs Glucose")
plt.tight_layout()
plt.show()
No description has been provided for this image

Glucose Levels by Gender¶

In [44]:
# Average Glucose Level by Gender
glucose_by_gender = (df.groupby("Gender", as_index=False)["Glucose"].mean().sort_values("Glucose", ascending=False))

plt.figure(figsize=(10, 5))
sns.barplot(data=glucose_by_gender, x="Gender", y="Glucose", palette="viridis")
plt.title("Average Glucose Level by Gender", fontsize=14)
plt.xlabel("Gender", fontsize=12)
plt.ylabel("Average Glucose Level", fontsize=12)

for index, row in glucose_by_gender.iterrows():
    plt.text(index, row["Glucose"] + 1, round(row["Glucose"], 1),ha='center', fontsize=10)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [45]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x="Gender", y="Glucose", palette="coolwarm")
plt.title("Glucose by Gender")
plt.tight_layout()
plt.show()
No description has been provided for this image

Diabetes Outcome vs Age¶

In [46]:
plt.figure(figsize=(8,5))
sns.histplot(data=df, x="Age", hue="Diabetes_Status", kde=True, palette="coolwarm")
plt.title("Age Distribution by Diabetes Status")
plt.xlabel("Age")
plt.tight_layout()
plt.show()
No description has been provided for this image

BMI by Smoking Status¶

In [47]:
plt.figure(figsize=(10,5))
sns.boxplot(data=df, x="Smoking_Status", y="BMI", palette="Set2")
plt.title("BMI by Smoking Status")
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Sample of 500 Rows¶

In [48]:
sample_df = df.sample(500, random_state=42)

plt.figure(figsize=(6, 4))
sns.scatterplot(data=sample_df,x="Glucose", y="Insulin",alpha=0.7, edgecolor=None, color="teal")
plt.title("Glucose vs Insulin (Sample of 500 Rows)", fontsize=14)
plt.xlabel("Glucose Level", fontsize=12)
plt.ylabel("Insulin Level", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Sample of 2500 Rows¶

In [49]:
sample_df = df.sample(2500, random_state=42)

plt.figure(figsize=(6, 4))
sns.scatterplot(data=sample_df,x="Glucose", y="Insulin",alpha=0.7, edgecolor=None, color="teal")
plt.title("Glucose vs Insulin (Sample of 2500 Rows)", fontsize=14)
plt.xlabel("Glucose Level", fontsize=12)
plt.ylabel("Insulin Level", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

More Detailed¶

In [50]:
plt.figure(figsize=(6, 4))
sns.scatterplot(data=df,x="Glucose", y="Insulin",alpha=0.7, edgecolor=None, color="teal")

plt.title("Glucose vs Insulin", fontsize=14)
plt.xlabel("Glucose Level", fontsize=12)
plt.ylabel("Insulin Level", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

Compares Glucose vs Diabetes Outcome¶

In [51]:
plt.figure(figsize=(6, 4))
sns.boxplot(data=df, x="Diabetes_Status", y="Glucose", palette="coolwarm")
plt.title("Glucose Levels by Diabetes Status", fontsize=14)
plt.xlabel("Diabetes Status", fontsize=12)
plt.ylabel("Glucose Level", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

Relationship between Age and BMI (Regplot)¶

Random Sample of 500¶

In [52]:
sample_500 = df.sample(500, random_state=42)

plt.figure(figsize=(6, 4))
sns.regplot(data=sample_500, x="Age", y="BMI", scatter_kws={"alpha":0.6}, line_kws={"color":"red"})
plt.title("Age vs BMI (Sample of 500)", fontsize=14)
plt.xlabel("Age", fontsize=12)
plt.ylabel("BMI", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Sample of 2500¶

In [53]:
sample_2500 = df.sample(2500, random_state=42)

plt.figure(figsize=(6, 4))
sns.regplot(data=sample_2500, x="Age", y="BMI",scatter_kws={"alpha":0.6}, line_kws={"color":"red"})
plt.title("Age vs BMI (Sample of 2500)", fontsize=14)
plt.xlabel("Age", fontsize=12)
plt.ylabel("BMI", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image

MULTIVARIATE ANALYSIS (Three or more variables)¶

Correlation Heatmap (All numeric variables)¶

In [54]:
plt.figure(figsize=(14,10))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()
No description has been provided for this image

Pairplot of Top Health Indicators¶

with Random Sample of 500 Rows¶

In [55]:
sample_500 = df.sample(500, random_state=42)
sns.pairplot(sample_500[["Age", "BMI", "Glucose", "Insulin", "Cholesterol", "Diabetes_Status"]],hue="Diabetes_Status",diag_kind="kde")
plt.show()
No description has been provided for this image
In [56]:
sns.pairplot(df[["Age", "BMI", "Glucose", "Insulin", "Cholesterol", "Diabetes_Status"]],
             hue="Diabetes_Status", diag_kind="kde")
plt.show()
No description has been provided for this image

Water Intake vs BP colored by Diabetes Outcome¶

Random Sample of 500 Rows¶

In [57]:
sample_df = df.sample(500, random_state=42)

plt.figure(figsize=(8,5))
sns.scatterplot(data=sample_df,x="Water_Intake_Liters",y="BloodPressure",hue="Diabetes_Status",alpha=0.7,palette="viridis")
plt.title("Water Intake vs Blood Pressure (Sample of 500 Rows)")
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Sample of 1500 Rows¶

In [58]:
sample_df = df.sample(1500, random_state=42)

plt.figure(figsize=(8,5))
sns.scatterplot(data=sample_df,x="Water_Intake_Liters",y="BloodPressure",hue="Diabetes_Status",alpha=0.7,palette="viridis")
plt.title("Water Intake vs Blood Pressure (Sample of 1500 Rows)")
plt.tight_layout()
plt.show()
No description has been provided for this image

More Detailed¶

In [59]:
plt.figure(figsize=(8,5))
sns.scatterplot(data=df, x="Water_Intake_Liters", y="BloodPressure",hue="Diabetes_Status", alpha=0.6, palette="viridis")
plt.title("Water Intake vs Blood Pressure")
plt.tight_layout()
plt.show()
No description has been provided for this image

Diet Type + BMI + Diabetes Outcome (Boxplot)¶

In [60]:
plt.figure(figsize=(10,5))
sns.boxplot(data=df, x="Diet_Type", y="BMI", hue="Diabetes_Status", palette="viridis")
plt.title("Diet Type vs BMI vs Diabetes_Status")
plt.tight_layout()
plt.show()
No description has been provided for this image

Glucose vs Insulin¶

Random Sample of 500 Rows¶

In [61]:
sample_500 = df.sample(500, random_state=42)

plt.figure(figsize=(6,4))
sns.scatterplot(data=sample_500,x="Glucose", y="Insulin", hue="Diabetes_Status", size="Age", alpha=0.6, palette="viridis")
plt.title("Glucose vs Insulin (Sample 500 | Colored by Diabetes_Status, Sized by Age)")
plt.tight_layout()
plt.show()
No description has been provided for this image

Random Sample of 2500 Rows¶

In [62]:
sample_500 = df.sample(2500, random_state=42)

plt.figure(figsize=(6,4))
sns.scatterplot(data=sample_500,x="Glucose", y="Insulin", hue="Diabetes_Status", size="Age", alpha=0.6, palette="viridis")
plt.title("Glucose vs Insulin (Sample 2500 | Colored by Diabetes_Status, Sized by Age)")
plt.tight_layout()
plt.show()
No description has been provided for this image

More Detailed¶

In [63]:
plt.figure(figsize=(6,4)) 
sns.scatterplot(data=df, x="Glucose", y="Insulin", hue="Diabetes_Status", size="Age", alpha=0.6, palette="viridis") 
plt.title("Glucose vs Insulin (Colored by Diabetes_Status, Sized by Age)") 
plt.tight_layout() 
plt.show()
No description has been provided for this image

Feature Correlation with Diabetes Outcome¶

In [64]:
corr_with_outcome = (df.corr(numeric_only=True)["Outcome"].drop("Outcome").sort_values(ascending=False))
corr_with_outcome.index.name = "" 

plt.figure(figsize=(8, 5))
sns.barplot(x=corr_with_outcome.values,y=corr_with_outcome.index,palette="magma")
plt.title("Feature Correlation with Diabetes Status", fontsize=14)
plt.xlabel("Correlation with Diabetes Status")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (confusion_matrix, classification_report,
                             accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

Select Target Variable

In [65]:
df["Diabetes_Status"] = df["Diabetes_Status"].map({"No Diabetes": 0, "Diabetes": 1})
y = df["Diabetes_Status"]

Feature Selection

In [66]:
X = df.drop(columns=["Diabetes_Status", "Outcome", "Country"])

Encode Categorical Variables

In [67]:
from sklearn.preprocessing import OneHotEncoder

X = pd.get_dummies(X, drop_first=True)

Train-Test Split

In [97]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Feature Scaling

In [98]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [101]:
from sklearn.impute import SimpleImputer
In [102]:
imputer = SimpleImputer(strategy="median")

X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
In [103]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)
In [107]:
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_scaled, y_train)

y_pred_log = log_model.predict(X_test_scaled)
y_prob_log = log_model.predict_proba(X_test_scaled)[:, 1]
In [108]:
cm_log = confusion_matrix(y_test, y_pred_log)

plt.figure(figsize=(5,4))
sns.heatmap(cm_log, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
No description has been provided for this image
In [109]:
print("Logistic Regression Classification Report:\n")
print(classification_report(y_test, y_pred_log))

print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log))
print("Recall:", recall_score(y_test, y_pred_log))
print("F1 Score:", f1_score(y_test, y_pred_log))
print("ROC AUC:", roc_auc_score(y_test, y_prob_log))
Logistic Regression Classification Report:

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      2569
           1       0.00      0.00      0.00      1031

    accuracy                           0.71      3600
   macro avg       0.36      0.50      0.42      3600
weighted avg       0.51      0.71      0.59      3600

Accuracy: 0.7136111111111111
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
ROC AUC: 0.4927285296335212

DecisionTree Classifier

In [74]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=6, random_state=42)
dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
print(classification_report(y_test, y_pred_dt))
              precision    recall  f1-score   support

           0       0.71      0.99      0.83      2569
           1       0.24      0.01      0.02      1031

    accuracy                           0.71      3600
   macro avg       0.48      0.50      0.42      3600
weighted avg       0.58      0.71      0.60      3600

In [110]:
cm_dt = confusion_matrix(y_test, y_pred_dt)

plt.figure(figsize=(5,4))
sns.heatmap(cm_dt, annot=True, fmt="d", cmap="Greens")
plt.title("Confusion Matrix - Decision Tree")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
No description has been provided for this image
In [113]:
print("Decision Tree Classification Report:\n")
print(classification_report(y_test, y_pred_dt))

print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall:", recall_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))
print("ROC AUC:", roc_auc_score(y_test, y_pred_dt))
Decision Tree Classification Report:

              precision    recall  f1-score   support

           0       0.71      0.99      0.83      2569
           1       0.24      0.01      0.02      1031

    accuracy                           0.71      3600
   macro avg       0.48      0.50      0.42      3600
weighted avg       0.58      0.71      0.60      3600

Accuracy: 0.7088888888888889
Precision: 0.24242424242424243
Recall: 0.007759456838021339
F1 Score: 0.015037593984962405
ROC AUC: 0.49901402191842675

RandomForestClassifier

In [75]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
              precision    recall  f1-score   support

           0       0.71      1.00      0.83      2569
           1       0.00      0.00      0.00      1031

    accuracy                           0.71      3600
   macro avg       0.36      0.50      0.42      3600
weighted avg       0.51      0.71      0.59      3600

In [114]:
cm_rf = confusion_matrix(y_test, y_pred_rf)

plt.figure(figsize=(5,4))
sns.heatmap(cm_rf, annot=True, fmt="d", cmap="Oranges")
plt.title("Confusion Matrix - Random Forest")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()
No description has been provided for this image
In [116]:
print("Random Forest Classification Report:\n")
print(classification_report(y_test, y_pred_rf))

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_pred_rf))
Random Forest Classification Report:

              precision    recall  f1-score   support

           0       0.71      1.00      0.83      2569
           1       0.00      0.00      0.00      1031

    accuracy                           0.71      3600
   macro avg       0.36      0.50      0.42      3600
weighted avg       0.51      0.71      0.59      3600

Accuracy: 0.7136111111111111
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
ROC AUC: 0.5

Model Evaluation Metrics

In [76]:
from sklearn.metrics import roc_auc_score

print("ROC AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:,1]))
ROC AUC: 0.4977764051650678
In [118]:
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_log)
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_dt)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf)

plt.figure(figsize=(8,6))
plt.plot(fpr_log, tpr_log, label="Logistic Regression")
plt.plot(fpr_dt, tpr_dt, label="Decision Tree")
plt.plot(fpr_rf, tpr_rf, label="Random Forest")
plt.plot([0,1], [0,1], linestyle="--")

plt.title("ROC Curve Comparison")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()
No description has been provided for this image
In [120]:
results = pd.DataFrame({
    "Model": ["Logistic Regression", "Decision Tree", "Random Forest"],
    "Accuracy": [
        accuracy_score(y_test, y_pred_log),
        accuracy_score(y_test, y_pred_dt),
        accuracy_score(y_test, y_pred_rf)
    ],
    "Precision": [
        precision_score(y_test, y_pred_log),
        precision_score(y_test, y_pred_dt),
        precision_score(y_test, y_pred_rf)
    ],
    "Recall": [
        recall_score(y_test, y_pred_log),
        recall_score(y_test, y_pred_dt),
        recall_score(y_test, y_pred_rf)
    ],
    "F1 Score": [
        f1_score(y_test, y_pred_log),
        f1_score(y_test, y_pred_dt),
        f1_score(y_test, y_pred_rf)
    ],
    "ROC AUC": [
        roc_auc_score(y_test, y_prob_log),
        roc_auc_score(y_test, y_pred_dt),
        roc_auc_score(y_test, y_pred_rf)
    ]
})

print(results.sort_values(by="ROC AUC", ascending=False))
                 Model  Accuracy  Precision    Recall  F1 Score   ROC AUC
2        Random Forest  0.713611   0.000000  0.000000  0.000000  0.500000
1        Decision Tree  0.708889   0.242424  0.007759  0.015038  0.499014
0  Logistic Regression  0.713611   0.000000  0.000000  0.000000  0.492729
In [122]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
Out[122]:
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [123]:
importances = rf_model.feature_importances_
feature_names = X.columns

feat_imp = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False).head(15)

plt.figure(figsize=(8,6))
sns.barplot(data=feat_imp, x="Importance", y="Feature")
plt.title("Top 15 Feature Importances - Random Forest")
plt.show()
No description has been provided for this image